\chapter{Information-Theoretic Analysis}
\label{chap:information_theory}

\section{Foundations and Motivation}

Information theory quantifies uncertainty, dependence, and causal influence in hierarchical systems. We extend Shannon measures to layered architectures, enabling precise diagnostics of coordination, redundancy, and emergence.

\begin{definition}[Hierarchical entropy]
For level $\ell$, the aggregated state $S_\ell$ has entropy
\begin{equation}
H_\ell = -\sum_{s_\ell} \Prob(S_\ell = s_\ell) \log \Prob(S_\ell = s_\ell).
\end{equation}
\end{definition}
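For example, a level whose aggregated state is uniform over $|\mathcal{S}_\ell|$ configurations attains the maximum $H_\ell = \log |\mathcal{S}_\ell|$, while a level frozen in a single configuration has $H_\ell = 0$; intermediate values track the degree of macro-order.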

Cross-level dependencies are quantified by mutual information.

\begin{definition}[Cross-level mutual information]
\label{def:cross_level_mi}
\begin{equation}
I(S_\ell; S_{\ell'}) = \sum_{s_\ell, s_{\ell'}} \Prob(s_\ell, s_{\ell'}) \log \frac{\Prob(s_\ell, s_{\ell'})}{\Prob(s_\ell)\Prob(s_{\ell'})}.
\end{equation}
\end{definition}
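The two extremes are instructive: $I(S_\ell; S_{\ell'}) = 0$ when the levels are statistically independent, and $I(S_\ell; S_{\ell'}) = H(S_{\ell'})$ when $S_{\ell'}$ is a deterministic function of $S_\ell$, so the measure interpolates between no coordination and full determination.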

\section{Directed Information Flow}

Analyzing hierarchical cooperation requires tracking directional influence between levels, not merely symmetric dependence.

\begin{definition}[Transfer entropy]
\label{def:transfer_entropy}
Given time series $S_\ell^{(t)}$ and $S_{\ell+1}^{(t)}$, the transfer entropy from level $\ell$ to level $\ell+1$, with history length one, is
\begin{equation}
TE_{\ell \rightarrow \ell+1} = \sum_{s_{\ell+1}^{(t+1)},\, s_\ell^{(t)},\, s_{\ell+1}^{(t)}} \Prob\bigl(s_{\ell+1}^{(t+1)}, s_\ell^{(t)}, s_{\ell+1}^{(t)}\bigr) \log \frac{\Prob\bigl(s_{\ell+1}^{(t+1)} \mid s_\ell^{(t)}, s_{\ell+1}^{(t)}\bigr)}{\Prob\bigl(s_{\ell+1}^{(t+1)} \mid s_{\ell+1}^{(t)}\bigr)}.
\end{equation}
\end{definition}
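In experiments, \cref{def:transfer_entropy} is estimated from sampled trajectories. The sketch below is a minimal Python plug-in (count-based) estimator, assuming two equally long sequences of integer-coded discrete states and a history length of one; the instrumented estimators of Appendix~\ref{app:stat_methods} refine this with the sampling procedures used in practice.
\begin{verbatim}
from collections import Counter
from math import log2

def transfer_entropy(source, target):
    """Plug-in transfer entropy from `source` to `target`
    (history length one), in bits."""
    n = len(target) - 1                                # number of transitions
    triples  = Counter(zip(target[1:], source[:-1], target[:-1]))
    pairs_st = Counter(zip(source[:-1], target[:-1]))  # (s_l^t, s_{l+1}^t)
    pairs_tt = Counter(zip(target[1:], target[:-1]))   # (s_{l+1}^{t+1}, s_{l+1}^t)
    hist_t   = Counter(target[:-1])                    # s_{l+1}^t
    te = 0.0
    for (t_next, s_now, t_now), count in triples.items():
        p_joint     = count / n
        p_cond_full = count / pairs_st[(s_now, t_now)]
        p_cond_self = pairs_tt[(t_next, t_now)] / hist_t[t_now]
        te += p_joint * log2(p_cond_full / p_cond_self)
    return te
\end{verbatim}
For instance, \texttt{transfer\_entropy(lower\_trace, upper\_trace)} applied to level-$\ell$ and level-$(\ell+1)$ trajectories recorded at matching time steps returns an estimate of $TE_{\ell \rightarrow \ell+1}$ in bits.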

\begin{definition}[Aggregation efficiency]
\label{def:aggregation_efficiency}
The efficiency of compressing information from level $\ell$ into level $\ell+1$ is
\begin{equation}
\eta_{\ell \rightarrow \ell+1} = \frac{I(S_\ell; S_{\ell+1})}{H(S_\ell)}.
\end{equation}
\end{definition}
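In particular, when $S_{\ell+1} = \Psi(S_\ell)$ for a deterministic aggregation map, $I(S_\ell; S_{\ell+1}) = H(S_{\ell+1})$ and the efficiency reduces to $\eta_{\ell \rightarrow \ell+1} = H(S_{\ell+1})/H(S_\ell)$, the fraction of lower-level entropy retained after aggregation; in general $0 \leq \eta_{\ell \rightarrow \ell+1} \leq 1$.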

\begin{proposition}
\label{prop:aggregation_bound}
For any deterministic aggregation map $\Psi: \mathcal{S}_\ell \rightarrow \mathcal{S}_{\ell+1}$, $I(S_\ell; \Psi(S_\ell)) \leq H(S_\ell)$ with equality if $\Psi$ is injective.
\end{proposition}

\begin{proof}
Since $\Psi$ is deterministic, $H(\Psi(S_\ell) \mid S_\ell) = 0$, so $I(S_\ell; \Psi(S_\ell)) = H(\Psi(S_\ell)) - H(\Psi(S_\ell) \mid S_\ell) = H(\Psi(S_\ell))$. A deterministic map cannot increase entropy, hence $H(\Psi(S_\ell)) \leq H(S_\ell)$. If $\Psi$ is injective on the support of $S_\ell$, no two states of positive probability are merged, so $H(\Psi(S_\ell)) = H(S_\ell)$ and equality holds.
\end{proof}

\section{Effective Information and Causal Emergence}

Correlation alone does not establish causal influence; we therefore adopt effective information (EI) \cite{hoel2017}, which quantifies how strongly interventions on one level constrain another.

\begin{definition}[Effective information]
Given an intervention $\text{do}(S_\ell)$ that sets $S_\ell$ uniformly at random over $\mathcal{S}_\ell$, the effective information exerted by level $\ell$ on level $\ell+1$ is
\begin{equation}
EI_{\ell \rightarrow \ell+1} = I(S_{\ell+1}; S_\ell \mid \text{do}(S_\ell)),
\end{equation}
i.e., the mutual information evaluated under the interventional rather than the observational distribution.
\end{definition}
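Two extreme cases calibrate the measure: if the intervened $S_\ell$ determines $S_{\ell+1}$ through an injective map, then $EI_{\ell \rightarrow \ell+1} = \log |\mathcal{S}_\ell|$, its maximum; if $S_{\ell+1}$ responds with the same distribution to every intervened value, then $EI_{\ell \rightarrow \ell+1} = 0$.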

\begin{definition}[Emergence strength]
\label{def:emergence_strength}
The causal emergence advantage of level $\ell+1$ over level $\ell$ is
\begin{equation}
\mathcal{E}_{\ell \rightarrow \ell+1} = \frac{EI_{\ell+1 \rightarrow \ell+1} - EI_{\ell \rightarrow \ell}}{EI_{\ell \rightarrow \ell}}.
\end{equation}
where $EI_{\ell \rightarrow \ell}$ and $EI_{\ell+1 \rightarrow \ell+1}$ denote each level's effective information over its own temporal dynamics, i.e., with $S_\ell^{(t)}$ (respectively $S_{\ell+1}^{(t)}$) as the intervened cause and its successor state as the effect.
\end{definition}
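As an illustrative calculation, $EI_{\ell \rightarrow \ell} = 1$ bit and $EI_{\ell+1 \rightarrow \ell+1} = 1.5$ bits give $\mathcal{E}_{\ell \rightarrow \ell+1} = (1.5 - 1)/1 = 0.5$, a fifty percent gain in causal efficacy from coarse-graining, whereas $\mathcal{E}_{\ell \rightarrow \ell+1} = 0$ means the macro description is causally no more effective than the micro one.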

\begin{theorem}[Emergence bounds]
\label{thm:emergence_bounds}
If each level has finite support and $EI_{\ell \rightarrow \ell} > 0$, then
\begin{equation}
0 \leq \mathcal{E}_{\ell \rightarrow \ell+1} \leq \frac{H_{\max}(S_{\ell+1}) - H_{\max}(S_\ell)}{H_{\min}(S_\ell)}.
\end{equation}
\end{theorem}

\begin{proof}
Effective information is upper-bounded by the entropy of the target level. Substituting into \cref{def:emergence_strength} yields the claimed inequality. Appendix~\ref{app:proofs} provides a detailed derivation.
\end{proof}

\section{Information Bottlenecks and Compression}

Hierarchical levels act as communication bottlenecks subject to capacity constraints.

\begin{definition}[Information bottleneck objective]
\label{def:information_bottleneck}
Given lower-level variable $X$, compressed representation $T$, and target $Y$, solve
\begin{equation}
\min_{P(T \mid X)} \; I(X; T) - \beta I(T; Y) \quad \text{subject to} \quad Y \leftrightarrow X \leftrightarrow T.
\end{equation}
\end{definition}
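Stationary points of this objective satisfy the self-consistent equations of \cite{tishby1999},
\begin{equation}
P(t \mid x) = \frac{P(t)}{Z(x, \beta)} \exp\!\bigl(-\beta\, D_{\mathrm{KL}}\!\left[P(y \mid x) \,\|\, P(y \mid t)\right]\bigr), \qquad
P(t) = \sum_x P(x) P(t \mid x), \qquad
P(y \mid t) = \sum_x P(y \mid x) P(x \mid t),
\end{equation}
where $Z(x, \beta)$ is a normalizing constant; these conditions underlie the bifurcation analysis in the proof of \cref{thm:bottleneck_phases} below.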

\begin{theorem}[Phase transitions in information bottlenecks]
\label{thm:bottleneck_phases}
Critical values of $\beta$ mark qualitative changes in the optimal partitioning of $X$ into equivalence classes. At each critical $\beta_c$, new solution branches appear, analogous to symmetry-breaking phase transitions.
\end{theorem}

\begin{proof}
Differentiating the Lagrangian yields the self-consistency equations for $P(t \mid x)$, $P(t)$, and $P(y \mid t)$. Bifurcation analysis shows that the number of solution branches changes when the smallest eigenvalue of the Hessian crosses zero. Full details are given in Appendix~\ref{app:proofs}, following the information bottleneck analysis of \cite{tishby1999}.
\end{proof}

\section{Metric Catalogue for Experiments}

We summarize the metrics used in Chapters~\ref{chap:simulation}--\ref{chap:experiments}; a minimal estimation sketch follows the list:
\begin{itemize}
    \item \textbf{State entropy} $H(S_t)$: monitors macro-order evolution.
    \item \textbf{Mutual information matrices}: quantify coordination between subgroups.
    \item \textbf{Transfer entropy}: gauges directional control efficacy.
    \item \textbf{Effective information}: distinguishes causal emergence from spurious correlation.
    \item \textbf{Information compression ratio}: $H(S_\ell)/H(S_{\ell-1})$ measures abstraction efficiency.
\end{itemize}
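For orientation, the following Python sketch gives plug-in versions of the first two catalogue entries and of the compression ratio, assuming sequences of integer-coded discrete states recorded at matching time steps.
\begin{verbatim}
from collections import Counter
from math import log2

def entropy(samples):
    """Plug-in Shannon entropy of a sequence of discrete states, in bits."""
    counts, n = Counter(samples), len(samples)
    return -sum((c / n) * log2(c / n) for c in counts.values())

def mutual_information(xs, ys):
    """Plug-in mutual information I(X; Y) from paired samples, in bits."""
    return entropy(xs) + entropy(ys) - entropy(list(zip(xs, ys)))

def compression_ratio(upper, lower):
    """Compression ratio H(S_l) / H(S_{l-1}) from samples of adjacent levels."""
    return entropy(upper) / entropy(lower)
\end{verbatim}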

Instrumented estimators and sampling procedures are detailed in Appendix~\ref{app:stat_methods}.

\section{Integration with Theory and Practice}

Information-theoretic diagnostics close the loop between theory and experiments. Chapter~\ref{chap:stat_mech} analyzes phase transitions via entropy and mutual information peaks, Chapter~\ref{chap:stochastic} investigates randomness management through information gain, and Chapter~\ref{chap:multiagent} encodes communication policies maximizing information throughput subject to bandwidth constraints.

